/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2024 Marvell.
 */

#ifndef __CN20K_WORKER_H__
#define __CN20K_WORKER_H__

#include <rte_event_timer_adapter.h>
#include <rte_eventdev.h>

#include "cn20k_eventdev.h"
#include "cn20k_rx.h"
#include "cnxk_worker.h"

/* CN20K Rx event fastpath */

static __rte_always_inline void
cn20k_wqe_to_mbuf(uint64_t wqe, const uint64_t __mbuf, uint8_t port_id, const uint32_t tag,
		  const uint32_t flags, const void *const lookup_mem, uintptr_t cpth,
		  uintptr_t sa_base)
{
	const uint64_t mbuf_init =
		0x100010000ULL | RTE_PKTMBUF_HEADROOM | (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
	struct rte_mbuf *mbuf = (struct rte_mbuf *)__mbuf;

	cn20k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag, (struct rte_mbuf *)mbuf, lookup_mem,
			      mbuf_init | ((uint64_t)port_id) << 48, cpth, sa_base, flags);
}

static void
cn20k_sso_process_tstamp(uint64_t u64, uint64_t mbuf, struct cnxk_timesync_info *tstamp)
{
	uint64_t tstamp_ptr;
	uint8_t laptr;

	laptr = (uint8_t)*(uint64_t *)(u64 + (CNXK_SSO_WQE_LAYR_PTR * sizeof(uint64_t)));
	if (laptr == sizeof(uint64_t)) {
		/* Extracting tstamp, if PTP enabled*/
		tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)u64) + CNXK_SSO_WQE_SG_PTR);
		cn20k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp, true,
					 (uint64_t *)tstamp_ptr);
	}
}

static __rte_always_inline void
cn20k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags, struct cn20k_sso_hws *ws)
{
	uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM;
	struct cnxk_timesync_info *tstamp = ws->tstamp[port_id];
	void *lookup_mem = ws->lookup_mem;
	uintptr_t lbase = ws->lmt_base;
	struct rte_event_vector *vec;
	uint16_t nb_mbufs, non_vec;
	struct rte_mbuf **wqe;
	struct rte_mbuf *mbuf;
	uint64_t sa_base = 0;
	uintptr_t cpth = 0;
	int i;

	mbuf_init |= ((uint64_t)port_id) << 48;
	vec = (struct rte_event_vector *)vwqe;
	wqe = vec->mbufs;

	rte_prefetch0(&vec->ptrs[0]);
#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
		rte_prefetch0(&vec->ptrs[i]);

	if (flags & NIX_RX_OFFLOAD_TSTAMP_F && tstamp)
		mbuf_init |= 8;

	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
	nb_mbufs = cn20k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs, flags | NIX_RX_VWQE_F,
					      lookup_mem, tstamp, lbase, 0);
	wqe += nb_mbufs;
	non_vec = vec->nb_elem - nb_mbufs;

	while (non_vec) {
		struct nix_cqe_hdr_s *cqe = (struct nix_cqe_hdr_s *)wqe[0];

		mbuf = (struct rte_mbuf *)((char *)cqe - sizeof(struct rte_mbuf));

		/* Mark mempool obj as "get" as it is alloc'ed by NIX */
		RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf, 1, 1);

		cn20k_nix_cqe_to_mbuf(cqe, cqe->tag, mbuf, lookup_mem, mbuf_init, cpth, sa_base,
				      flags);

		if (flags & NIX_RX_OFFLOAD_TSTAMP_F)
			cn20k_sso_process_tstamp((uint64_t)wqe[0], (uint64_t)mbuf, tstamp);
		wqe[0] = (struct rte_mbuf *)mbuf;
		non_vec--;
		wqe++;
	}
}

static __rte_always_inline void
cn20k_sso_hws_post_process(struct cn20k_sso_hws *ws, uint64_t *u64, const uint32_t flags)
{
	uintptr_t sa_base = 0;

	u64[0] = (u64[0] & (0x3ull << 32)) << 6 | (u64[0] & (0x3FFull << 36)) << 4 |
		 (u64[0] & 0xffffffff);
	if (CNXK_EVENT_TYPE_FROM_TAG(u64[0]) == RTE_EVENT_TYPE_ETHDEV) {
		uint8_t port = CNXK_SUB_EVENT_FROM_TAG(u64[0]);
		uintptr_t cpth = 0;
		uint64_t mbuf;

		mbuf = u64[1] - sizeof(struct rte_mbuf);
		rte_prefetch0((void *)mbuf);

		/* Mark mempool obj as "get" as it is alloc'ed by NIX */
		RTE_MEMPOOL_CHECK_COOKIES(((struct rte_mbuf *)mbuf)->pool, (void **)&mbuf, 1, 1);

		u64[0] = CNXK_CLR_SUB_EVENT(u64[0]);
		cn20k_wqe_to_mbuf(u64[1], mbuf, port, u64[0] & 0xFFFFF, flags, ws->lookup_mem, cpth,
				  sa_base);
		if (flags & NIX_RX_OFFLOAD_TSTAMP_F)
			cn20k_sso_process_tstamp(u64[1], mbuf, ws->tstamp[port]);
		u64[1] = mbuf;
	} else if (CNXK_EVENT_TYPE_FROM_TAG(u64[0]) == RTE_EVENT_TYPE_ETHDEV_VECTOR) {
		uint8_t port = CNXK_SUB_EVENT_FROM_TAG(u64[0]);
		__uint128_t vwqe_hdr = *(__uint128_t *)u64[1];

		vwqe_hdr = ((vwqe_hdr >> 64) & 0xFFF) | BIT_ULL(31) | ((vwqe_hdr & 0xFFFF) << 48) |
			   ((uint64_t)port << 32);
		*(uint64_t *)u64[1] = (uint64_t)vwqe_hdr;
		cn20k_process_vwqe(u64[1], port, flags, ws);
		/* Mark vector mempool object as get */
		RTE_MEMPOOL_CHECK_COOKIES(rte_mempool_from_obj((void *)u64[1]), (void **)&u64[1], 1,
					  1);
	} else if (CNXK_EVENT_TYPE_FROM_TAG(u64[0]) == RTE_EVENT_TYPE_TIMER) {
		struct rte_event_timer *tev = (struct rte_event_timer *)u64[1];

		tev->state = RTE_EVENT_TIMER_NOT_ARMED;
		u64[1] = tev->ev.u64;
	}
}

static __rte_always_inline uint16_t
cn20k_sso_hws_get_work(struct cn20k_sso_hws *ws, struct rte_event *ev, const uint32_t flags)
{
	union {
		__uint128_t get_work;
		uint64_t u64[2];
	} gw;

	gw.get_work = ws->gw_wdata;
#if defined(RTE_ARCH_ARM64)
#if defined(__clang__)
	register uint64_t x0 __asm("x0") = (uint64_t)gw.u64[0];
	register uint64_t x1 __asm("x1") = (uint64_t)gw.u64[1];
#if defined(RTE_ARM_USE_WFE)
	plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
	asm volatile(PLT_CPU_FEATURE_PREAMBLE
		     "		ldp %[x0], %[x1], [%[tag_loc]]	\n"
		     "		tbz %[x0], %[pend_gw], done%=	\n"
		     "		sevl					\n"
		     "rty%=:	wfe					\n"
		     "		ldp %[x0], %[x1], [%[tag_loc]]	\n"
		     "		tbnz %[x0], %[pend_gw], rty%=	\n"
		     "done%=:						\n"
		     "		dmb ld					\n"
		     : [x0] "+r" (x0), [x1] "+r" (x1)
		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
		       [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
		     : "memory");
#else
	asm volatile(".arch armv8-a+lse\n"
		     "caspal %[x0], %[x1], %[x0], %[x1], [%[dst]]\n"
		     : [x0] "+r" (x0), [x1] "+r" (x1)
		     : [dst] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
		     : "memory");
#endif
	gw.u64[0] = x0;
	gw.u64[1] = x1;
#else
#if defined(RTE_ARM_USE_WFE)
	plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
	asm volatile(PLT_CPU_FEATURE_PREAMBLE
		     "		ldp %[wdata], %H[wdata], [%[tag_loc]]	\n"
		     "		tbz %[wdata], %[pend_gw], done%=	\n"
		     "		sevl					\n"
		     "rty%=:	wfe					\n"
		     "		ldp %[wdata], %H[wdata], [%[tag_loc]]	\n"
		     "		tbnz %[wdata], %[pend_gw], rty%=	\n"
		     "done%=:						\n"
		     "		dmb ld					\n"
		     : [wdata] "=&r"(gw.get_work)
		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
		       [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
		     : "memory");
#else
	asm volatile(PLT_CPU_FEATURE_PREAMBLE
		     "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
		     : [wdata] "+r"(gw.get_work)
		     : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
		     : "memory");
#endif
#endif
#else
	plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
	do {
		roc_load_pair(gw.u64[0], gw.u64[1], ws->base + SSOW_LF_GWS_WQE0);
	} while (gw.u64[0] & BIT_ULL(63));
	rte_atomic_thread_fence(rte_memory_order_seq_cst);
#endif
	ws->gw_rdata = gw.u64[0];
	if (gw.u64[1])
		cn20k_sso_hws_post_process(ws, gw.u64, flags);

	ev->event = gw.u64[0];
	ev->u64 = gw.u64[1];

	return !!gw.u64[1];
}

/* Used in cleaning up workslot. */
static __rte_always_inline uint16_t
cn20k_sso_hws_get_work_empty(struct cn20k_sso_hws *ws, struct rte_event *ev, const uint32_t flags)
{
	union {
		__uint128_t get_work;
		uint64_t u64[2];
	} gw;

#ifdef RTE_ARCH_ARM64
	asm volatile(PLT_CPU_FEATURE_PREAMBLE
		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
		     "		tbz %[tag], 63, .Ldone%=		\n"
		     "		sevl					\n"
		     ".Lrty%=:	wfe					\n"
		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
		     "		tbnz %[tag], 63, .Lrty%=		\n"
		     ".Ldone%=:	dmb ld					\n"
		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
		     : "memory");
#else
	do {
		roc_load_pair(gw.u64[0], gw.u64[1], ws->base + SSOW_LF_GWS_WQE0);
	} while (gw.u64[0] & BIT_ULL(63));
#endif

	ws->gw_rdata = gw.u64[0];
	if (gw.u64[1])
		cn20k_sso_hws_post_process(ws, gw.u64, flags);
	else
		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
			    (gw.u64[0] & (0x3FFull << 36)) << 4 | (gw.u64[0] & 0xffffffff);

	ev->event = gw.u64[0];
	ev->u64 = gw.u64[1];

	return !!gw.u64[1];
}

/* CN20K Fastpath functions. */
uint16_t __rte_hot cn20k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
					   uint16_t nb_events);
uint16_t __rte_hot cn20k_sso_hws_enq_new_burst(void *port, const struct rte_event ev[],
					       uint16_t nb_events);
uint16_t __rte_hot cn20k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
					       uint16_t nb_events);
int __rte_hot cn20k_sso_hws_profile_switch(void *port, uint8_t profile);
int __rte_hot cn20k_sso_hws_preschedule_modify(void *port,
					       enum rte_event_dev_preschedule_type type);
void __rte_hot cn20k_sso_hws_preschedule(void *port, enum rte_event_dev_preschedule_type type);

#define R(name, flags)                                                                             \
	uint16_t __rte_hot cn20k_sso_hws_deq_burst_##name(                                         \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_tmo_burst_##name(                                     \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_ca_burst_##name(                                      \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_tmo_ca_burst_##name(                                  \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_seg_burst_##name(                                     \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_tmo_seg_burst_##name(                                 \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_ca_seg_burst_##name(                                  \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_deq_tmo_ca_seg_burst_##name(                              \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_burst_##name(                                    \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_tmo_burst_##name(                                \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_ca_burst_##name(                                 \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_tmo_ca_burst_##name(                             \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_seg_burst_##name(                                \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_tmo_seg_burst_##name(                            \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_ca_seg_burst_##name(                             \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);    \
	uint16_t __rte_hot cn20k_sso_hws_reas_deq_tmo_ca_seg_burst_##name(                         \
		void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks);

NIX_RX_FASTPATH_MODES
#undef R

#define SSO_DEQ(fn, flags)                                                                         \
	static __rte_always_inline uint16_t fn(void *port, struct rte_event *ev,                   \
					       uint64_t timeout_ticks)                             \
	{                                                                                          \
		struct cn20k_sso_hws *ws = port;                                                   \
		RTE_SET_USED(timeout_ticks);                                                       \
		if (ws->swtag_req) {                                                               \
			ws->swtag_req = 0;                                                         \
			ws->gw_rdata = cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);       \
			return 1;                                                                  \
		}                                                                                  \
		return cn20k_sso_hws_get_work(ws, ev, flags);                                      \
	}

#define SSO_DEQ_SEG(fn, flags) SSO_DEQ(fn, flags | NIX_RX_MULTI_SEG_F)

#define SSO_DEQ_TMO(fn, flags)                                                                     \
	static __rte_always_inline uint16_t fn(void *port, struct rte_event *ev,                   \
					       uint64_t timeout_ticks)                             \
	{                                                                                          \
		struct cn20k_sso_hws *ws = port;                                                   \
		uint16_t ret = 1;                                                                  \
		uint64_t iter;                                                                     \
		if (ws->swtag_req) {                                                               \
			ws->swtag_req = 0;                                                         \
			ws->gw_rdata = cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);       \
			return ret;                                                                \
		}                                                                                  \
		ret = cn20k_sso_hws_get_work(ws, ev, flags);                                       \
		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)                         \
			ret = cn20k_sso_hws_get_work(ws, ev, flags);                               \
		return ret;                                                                        \
	}

#define SSO_DEQ_TMO_SEG(fn, flags) SSO_DEQ_TMO(fn, flags | NIX_RX_MULTI_SEG_F)

#define R(name, flags)                                                                             \
	SSO_DEQ(cn20k_sso_hws_deq_##name, flags)                                                   \
	SSO_DEQ(cn20k_sso_hws_reas_deq_##name, flags | NIX_RX_REAS_F)                              \
	SSO_DEQ_SEG(cn20k_sso_hws_deq_seg_##name, flags)                                           \
	SSO_DEQ_SEG(cn20k_sso_hws_reas_deq_seg_##name, flags | NIX_RX_REAS_F)                      \
	SSO_DEQ_TMO(cn20k_sso_hws_deq_tmo_##name, flags)                                           \
	SSO_DEQ_TMO(cn20k_sso_hws_reas_deq_tmo_##name, flags | NIX_RX_REAS_F)                      \
	SSO_DEQ_TMO_SEG(cn20k_sso_hws_deq_tmo_seg_##name, flags)                                   \
	SSO_DEQ_TMO_SEG(cn20k_sso_hws_reas_deq_tmo_seg_##name, flags | NIX_RX_REAS_F)

NIX_RX_FASTPATH_MODES
#undef R

#define SSO_CMN_DEQ_BURST(fnb, fn, flags)                                                          \
	uint16_t __rte_hot fnb(void *port, struct rte_event ev[], uint16_t nb_events,              \
			       uint64_t timeout_ticks)                                             \
	{                                                                                          \
		RTE_SET_USED(nb_events);                                                           \
		return fn(port, ev, timeout_ticks);                                                \
	}

#define SSO_CMN_DEQ_SEG_BURST(fnb, fn, flags)                                                      \
	uint16_t __rte_hot fnb(void *port, struct rte_event ev[], uint16_t nb_events,              \
			       uint64_t timeout_ticks)                                             \
	{                                                                                          \
		RTE_SET_USED(nb_events);                                                           \
		return fn(port, ev, timeout_ticks);                                                \
	}

uint16_t __rte_hot cn20k_sso_hws_deq_burst_all_offload(void *port, struct rte_event ev[],
						       uint16_t nb_events, uint64_t timeout_ticks);
uint16_t __rte_hot cn20k_sso_hws_deq_burst_all_offload_tst(void *port, struct rte_event ev[],
							   uint16_t nb_events,
							   uint64_t timeout_ticks);

#endif
